#coding=utf-8

import bottleneck as bn

import time
from itertools import izip
import threading
import numpy
import logging
import math
from gensim import matutils
from gensim.models import Word2Vec
import os
from abc import ABCMeta, abstractmethod
import copy
import warnings
from scipy.sparse import csc_matrix, lil_matrix

try:
    from queue import Queue
except ImportError:
    from Queue import Queue

import joblib

logger = logging.getLogger("dcnn.dcnn")

# Optional compiled fast path: build and import the Cython kernels in
# `dcnn_inner`; on any failure fall back to the pure-Python/NumPy
# replacements below. Both branches bind the same module-level names
# (FLOAT/INT/U_INT dtypes and the kernel functions) used by the layers.
try:
    # NOTE(review): this unconditional raise deliberately short-circuits the
    # Cython path and forces the fallback branch — it looks like a leftover
    # debugging toggle; delete this line to re-enable the compiled kernels.
    raise Exception
    import pyximport
    import numpy as np

    pyximport.install(setup_args={
        "include_dirs": [
        '/usr/include',
        '/System/Library/Frameworks/vecLib.framework/Versions/A/Headers',
        np.get_include()],
        "extra_compile_args": ["-O2"],
        "extra_link_args": ["-Wl,-O1", "-Wl,--as-needed"],
        "libraries": [('blas', {})],
        }, reload_support=True)

    import dcnn_inner
    FLOAT = dcnn_inner.FLOAT_TYPE
    INT = dcnn_inner.INT_TYPE
    U_INT = dcnn_inner.U_INT_TYPE

    softmax = dcnn_inner.softmax_fast
    sigmoid = dcnn_inner.sigmoid_fast
    tanh = dcnn_inner.tanh_fast
    nonlinear_func = sigmoid
    convolve2d = dcnn_inner.convolve2d_fast
    input_layer_backward = dcnn_inner.input_layer_backward_fast
    wide_convolution_layer_forward = dcnn_inner.wide_convolution_layer_forward_fast
    wide_convolution_layer_backward = dcnn_inner.wide_convolution_layer_backward_fast
    k_max_pooling_image = dcnn_inner.k_max_pooling_image_fast
    k_max_pooling_backward = dcnn_inner.k_max_pooling_backward_fast
    folding_image = dcnn_inner.folding_image_fast
# NOTE(review): bare `except:` hides the actual failure (ImportError vs. a
# compile error vs. a typo in the block above); `except Exception:` with a
# logged message would be safer.
except:
    import numpy as np
    from dcnn_inner_replacements import *
    FLOAT = FLOAT_TYPE
    INT = INT_TYPE
    U_INT = U_INT_TYPE

class Vocab(object):
    """Lightweight record for one vocabulary entry.

    Arbitrary attributes (e.g. ``index``) are attached via keyword
    arguments; ``count`` defaults to 0 and defines the sort order.
    """

    def __init__(self, **kwargs):
        self.count = 0
        for name, value in kwargs.items():
            setattr(self, name, value)

    def __lt__(self, other):
        # Entries order by raw occurrence count.
        return self.count < other.count

    def __str__(self):
        public_keys = sorted(k for k in self.__dict__ if not k.startswith('_'))
        body = ', '.join('%s:%r' % (k, self.__dict__[k]) for k in public_keys)
        return '<' + body + '>'


class LineSentence(object):
    """Simple format: one sentence = one line; words already preprocessed and separated by whitespace."""

    def __init__(self, source, repeat=1):
        """
        `source` is a path to a text file, opened once per pass.
        Example:
            sentences = LineSentence('myfile.txt')
        """
        self.source = source
        self.repeat = repeat

    def __iter__(self):
        """Yield one token list per line, streaming the file `repeat` times."""
        passes_left = self.repeat
        while passes_left > 0:
            passes_left -= 1
            with open(self.source, 'rb') as stream:
                for raw_line in stream:
                    yield to_unicode(raw_line).split()


def to_unicode(text, encoding='utf8', errors='strict'):
    """Return `text` as unicode, decoding bytestrings with `encoding`.

    Input that is already unicode is returned unchanged.
    """
    if not isinstance(text, unicode):
        text = unicode(text, encoding, errors=errors)
    return text


def build_vocab(sentences, min_count):
    """
    Build vocabulary from a sequence of sentences (can be a once-only generator stream).

    Returns (vocab, index2word): words seen at least `min_count` times, each
    with a dense `index`, plus the reverse index->word list.
    """
    raw_counts = {}
    total_words = 0
    sentence_no = -1
    for sentence_no, sentence in enumerate(sentences):
        if sentence_no % 10000 == 0:
            logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
                        (sentence_no, total_words, len(raw_counts)))
        for word in sentence:
            total_words += 1
            entry = raw_counts.get(word)
            if entry is None:
                raw_counts[word] = Vocab(count=1)
            else:
                entry.count += 1
    logger.info("collected %i word types from a corpus of %i words and %i sentences" %
                (len(raw_counts), total_words, sentence_no + 1))

    # Keep only frequent words and hand each survivor a dense index.
    kept_vocab, index2word = {}, []
    for word, entry in raw_counts.items():
        if entry.count >= min_count:
            entry.index = len(kept_vocab)
            index2word.append(word)
            kept_vocab[word] = entry
    logger.info("total %i word types after removing those with count<%s" % (len(kept_vocab), min_count))
    return kept_vocab, index2word


def sentence2indexes(sentence, vocab, len_of_sentence_limit):
    """Map known words of `sentence` to their vocabulary indexes.

    Unknown words are skipped; iteration stops at the first in-vocabulary
    word whose position reaches `len_of_sentence_limit`.
    """
    result = []
    for pos, token in enumerate(sentence):
        entry = vocab.get(token)
        if entry is None:
            continue
        if pos >= len_of_sentence_limit:
            break
        result.append(entry.index)
    return result


class Layer(object):
    """Abstract interface every DCNN layer implements.

    Per-sample lifecycle: forward() caches whatever backward() will need,
    backward() accumulates gradients into `self.grads`, and update() applies
    them with learning rate `alpha` and then clears them via reset_grads().
    """
    __metaclass__ = ABCMeta  # Python 2 style abstract-base-class declaration

    @abstractmethod
    def forward(self, input, sentence_len=None, train_or_not=True):
        """Compute and return this layer's output for `input`."""
        pass

    @abstractmethod
    def backward(self, delta):
        """Accumulate gradients from upstream `delta`; return the delta for the layer below."""
        pass

    @abstractmethod
    def update(self, alpha):
        """Apply accumulated gradients scaled by `alpha`, then reset them."""
        pass

    @abstractmethod
    def reset_grads(self):
        """Zero/clear the accumulated gradients."""
        pass


class InputLayer(Layer):
    """Word-embedding lookup layer with word-level dropout.

    Holds the embedding matrix W of shape (wordvec_dim, vocab_size);
    forward() maps a list of word indexes to a (1, wordvec_dim, n_words)
    tensor. Gradients are kept sparsely (per touched column).
    """

    def __init__(self, rng, vocab, wordvec_dim, len_of_sentence_limit, decay, dropout_rate=0.2):
        self.rng = rng
        self.vocab = vocab
        self.wordvec_dim = wordvec_dim
        self.len_of_sentence_limit = len_of_sentence_limit
        self.decay = decay
        self.dropout_rate = dropout_rate
        self.W = numpy.empty((self.wordvec_dim, len(self.vocab)), dtype=FLOAT)
        # Glorot-style uniform initialisation (scaled by 4), one column per word.
        fan_in = len(self.vocab)
        fan_out = self.wordvec_dim
        W_bound = 4 * numpy.sqrt(6. / (fan_in + fan_out))
        for i in xrange(len(self.vocab)):
            #self.W[:, i] = (self.rng.rand(self.wordvec_dim) - 0.5) / self.wordvec_dim
            self.W[:, i] = numpy.asarray(self.rng.uniform(low=-W_bound, high=W_bound, size=(self.wordvec_dim,)),
                                         dtype=FLOAT)
        self.params = [self.W, ]
        self.reset_grads()

    def forward(self, input, sentence_len=None, train_or_not=False):
        """Look up embeddings for `input` (a list of word indexes).

        Returns an array of shape (1, wordvec_dim, len(input)). When
        `train_or_not` is True, random entries are zeroed (dropout) and the
        mask is cached so backward() can zero the matching gradients.
        """
        # output is with shape of (1, self.wordvec_dim, len(indexes))
        self.train_or_not = train_or_not
        if len(input) == 0:
            raise ValueError('len(indexes) == 0')
        self.indexes = input
        output = self.W[:, self.indexes]
        # dropout
        # NOTE(review): `dcnn_inner` is only bound when the Cython import at
        # module load succeeded; on the pure-Python fallback path this raises
        # NameError unless dcnn_inner_replacements also exports `dcnn_inner`
        # — verify the fallback module.
        if train_or_not:
            self.dropout_rows, self.dropout_cols = dcnn_inner.dropout_sample_2d(output.shape[0], output.shape[1], self.dropout_rate)
            output[self.dropout_rows, self.dropout_cols] = 0.
        self.output = np.expand_dims(output, axis=0)
        return self.output

    def backward(self, delta):
        """Accumulate sparse embedding gradients for the words just seen."""
        # dropout
        if self.train_or_not:
            delta[0, self.dropout_rows, self.dropout_cols] = 0.
        # delta is with shape of (1, self.wordvec_dim, len(indexes))
        indexes = numpy.asarray(self.indexes, dtype=U_INT)
        #self.grads[0] = self.grads[0] + csc_matrix(input_layer_backward(delta, decay, self.W, indexes), dtype=FLOAT)
        #self.grads[0] += input_layer_backward(delta, decay, self.W, indexes)
        dcnn_inner.input_layer_backward_special(delta, self.decay, self.W, indexes, self.grads[0])

    def update(self, alpha):
        """Apply accumulated per-column gradients to W, then reset them."""
        #self.W -= alpha * self.grads[0]
        for k, v in self.grads[0].iteritems():
            self.W[:, k] -= alpha * v
        self.params = [self.W]
        self.reset_grads()

    def reset_grads(self):
        # Sparse gradient store: column index -> gradient vector (see update()).
        #self.grads = [csc_matrix(self.W.shape, dtype=FLOAT), ]
        #self.grads = [np.zeros(self.W.shape, dtype=FLOAT), ]
        self.grads = [{}, ]


class WideConvolutionLayer(Layer):
    """Wide 1-D convolution over a stack of feature maps.

    forward() maps a (n_feature_maps, rows, cols) tensor to a
    (n_filters, rows, cols + window_size - 1) tensor, optionally passed
    through the module-level nonlinearity.
    """

    def __init__(self, rng, window_size, n_filters, n_feature_maps, decay, linear=True, back_linear=True):
        self.window_size = window_size
        self.n_filters = n_filters
        self.n_feature_maps = n_feature_maps
        self.rng = rng
        self.linear = linear
        self.back_linear = back_linear
        self.decay = decay
        # Glorot-style uniform initialisation (scaled by 4, sigmoid heuristic).
        bound = 4 * numpy.sqrt(6. / (window_size * n_feature_maps + n_filters * window_size))
        filter_shape = (n_feature_maps, n_filters, 1, window_size)
        self.W = numpy.asarray(self.rng.uniform(low=-bound, high=bound, size=filter_shape), dtype=FLOAT)
        self.b = numpy.zeros((self.n_filters,), dtype=FLOAT)
        self.params = [self.W, self.b]
        self.reset_grads()

    def forward(self, input, sentence_len=None, train_or_not=False):
        """Convolve `input` with every filter; the input is cached for backward()."""
        assert self.n_feature_maps == input.shape[0]
        self.n_feature_rows, self.n_feature_cols = input.shape[1:]
        self.input = input
        pre_activation = wide_convolution_layer_forward(input, self.window_size, self.n_filters, self.W, self.b)
        self.output = pre_activation if self.linear else nonlinear_func(pre_activation)
        return self.output

    def backward(self, delta):
        """Accumulate filter/bias gradients; return the delta for the layer below."""
        back, grad_filters, grad_bias = wide_convolution_layer_backward(self.input, self.W, self.b, delta,
                                                                        self.decay, self.back_linear)
        self.back_grad = back
        self.grads[0] += grad_filters
        self.grads[1] += grad_bias
        return self.back_grad

    def update(self, alpha):
        """One SGD step with the accumulated gradients, then reset them."""
        self.W -= alpha * self.grads[0]
        self.b -= alpha * self.grads[1]
        self.params = [self.W, self.b]
        self.reset_grads()

    def reset_grads(self):
        self.grads = [numpy.zeros(self.W.shape, dtype=FLOAT), numpy.zeros(self.b.shape, dtype=FLOAT)]


class DynamicKMaxPooling(Layer):
    """Dynamic k-max pooling: keeps the k largest values (in order) of every
    feature row, where k shrinks with depth but never below `k_top`.
    """

    def __init__(self, n_feature_maps, n_feature_rows, total_num_of_conv_layers,
                 current_num_of_conv_layer, k_top, linear=False, back_linear=True, decay=None):
        self.total_num_of_conv_layers = total_num_of_conv_layers
        self.current_num_of_conv_layer = current_num_of_conv_layer
        self.k_top = k_top
        self.n_feature_maps = n_feature_maps
        self.n_feature_rows = n_feature_rows
        self.linear = linear
        self.back_linear = back_linear
        self.decay = decay
        # Fraction of the sentence kept at this depth: deeper layers keep less.
        self.p = (self.total_num_of_conv_layers - self.current_num_of_conv_layer) * 1.0 / self.total_num_of_conv_layers
        self.b = numpy.zeros((n_feature_maps, n_feature_rows), dtype=FLOAT)
        self.params = [self.b, ]
        self.reset_grads()

    def forward(self, input, sentence_len=None, train_or_not=False):
        """Pool `input` (n_feature_maps, n_feature_rows, n_feature_cols) down
        to (n_feature_maps, n_feature_rows, k), caching the argmax indexes
        for backward()."""
        # BUGFIX: identity test instead of `!= None` (PEP 8; also safe for
        # NumPy scalars).
        assert sentence_len is not None
        assert self.n_feature_maps == input.shape[0]
        assert self.n_feature_rows == input.shape[1]
        self.feature_cols = input.shape[2]
        self.input = input
        # BUGFIX: math.ceil returns a float under Python 2, which then leaked
        # into the array shapes below (deprecated/TypeError under modern
        # NumPy); coerce to int so k is always an integer dimension.
        self.k = max(self.k_top, int(math.ceil(self.p * sentence_len)))
        self.output = numpy.empty((self.n_feature_maps, self.n_feature_rows, self.k), dtype=FLOAT)
        self.kmax_index = numpy.empty((self.n_feature_maps, self.n_feature_rows, self.k), dtype=U_INT)
        for i in range(self.n_feature_maps):
            new_image, self.kmax_index[i, :, :] = k_max_pooling_image(input[i, :, :], self.k, self.b[i])
            if self.linear:
                self.output[i, :, :] = new_image
            else:
                self.output[i, :, :] = nonlinear_func(new_image)

        return self.output

    def backward(self, delta):
        """Scatter pooled gradients back to the pre-pooling positions;
        accumulate the per-row bias gradient."""
        if delta.ndim != 3:  # delta arriving flat from a dense layer: restore 3-D shape
            delta = delta.reshape((self.n_feature_maps, self.n_feature_rows, self.k))
        self.back_grad, grad_b = k_max_pooling_backward(delta.astype(FLOAT), self.input, self.kmax_index,
                                                        self.back_linear)
        self.grads[0] += grad_b
        return self.back_grad

    def update(self, alpha):
        """One SGD step on the per-row bias, then reset the accumulator."""
        self.b -= alpha * self.grads[0]
        self.params = [self.b]
        self.reset_grads()

    def reset_grads(self):
        self.grads = [numpy.zeros(self.b.shape, dtype=FLOAT)]


class Folding(Layer):
    """Folding layer: sums adjacent row pairs of every feature map, halving
    the row count. Has no trainable parameters.
    """

    def __init__(self, n_feature_maps, back_linear=False):
        self.n_feature_maps = n_feature_maps
        self.back_linear = back_linear
        self.params = []  # nothing to learn
        self.reset_grads()

    def forward(self, input, sentence_len=None, train_or_not=False):
        """Fold `input` (n_feature_maps, n_feature_rows, n_feature_cols) into
        (n_feature_maps, n_feature_rows/2, n_feature_cols)."""
        assert self.n_feature_maps == input.shape[0]
        self.n_feature_rows, self.n_feature_cols = input.shape[1:]
        assert self.n_feature_rows % 2 == 0
        # BUGFIX: floor division keeps the row count an int (plain `/` yields
        # a float under Python 3 / future division, which breaks numpy.empty);
        # identical result in Python 2.
        self.output = numpy.empty((self.n_feature_maps, self.n_feature_rows // 2, self.n_feature_cols), dtype=FLOAT)
        for i in xrange(self.n_feature_maps):
            self.output[i, :, :] = folding_image(input[i, :, :])
        self.input = input
        return self.output

    def backward(self, grad):
        """Route each folded-row gradient back to both of its source rows."""
        # Kronecker product with a (2, 1) column of ones duplicates every row
        # of `grad`, undoing the fold.
        f = numpy.ones(2).reshape((2, 1))
        self.back_grad = numpy.empty(self.input.shape, dtype=FLOAT)
        for i in xrange(self.n_feature_maps):
            if not self.back_linear:
                # Multiply by x*(1-x) — assumes `input` holds sigmoid
                # activations from the layer below.
                self.back_grad[i, :, :] = numpy.kron(grad[i, :, :], f) * self.input[i, :, :] * (1 - self.input[i, :, :])
            else:
                self.back_grad[i, :, :] = numpy.kron(grad[i, :, :], f)
        return self.back_grad

    def reset_grads(self):
        self.grads = []

    def update(self, alpha):
        # No parameters to update.
        pass


class FullConnectedLayer(Layer):
    """Dense sigmoid layer with dropout.

    Maps a (n_feature_maps, n_feature_rows, n_feature_cols) tensor, flattened,
    to a vector of `layer_size` activations.
    """

    def __init__(self, rng, n_feature_maps, n_feature_rows, n_feature_cols, layer_size, decay, back_linear=True, dropout_rate=0.5):
        self.rng = rng
        self.n_feature_maps = n_feature_maps
        self.n_feature_rows = n_feature_rows
        self.n_feature_cols = n_feature_cols
        self.layer_size = layer_size
        self.decay = decay
        self.back_linear = back_linear
        self.dropout_rate = dropout_rate
        # Glorot-style uniform initialisation (scaled by 4) over the flattened
        # input size.
        fan_in = self.n_feature_rows * self.n_feature_cols * self.n_feature_maps
        fan_out = self.layer_size
        W_bound = 4 * numpy.sqrt(6. / (fan_in + fan_out))
        self.W = numpy.asarray(rng.uniform(low=-W_bound, high=W_bound, \
                                           size=(self.n_feature_maps * self.n_feature_rows * self.n_feature_cols,
                                                 self.layer_size)), dtype=FLOAT)
        self.b = numpy.zeros((self.layer_size,), dtype=FLOAT)
        self.params = [self.W, self.b]
        self.reset_grads()

    def forward(self, input, sentence_len=None, train_or_not=False):
        """Return nonlinear_func(input.flatten() . W + b), applying dropout
        (and caching its mask) when `train_or_not` is True."""
        # input should be with shape of (n_feature_maps, feature_width, feature_height)
        # output should be with shape of (layer_size,)
        assert self.n_feature_maps == input.shape[0]
        assert self.n_feature_rows == input.shape[1]
        assert self.n_feature_cols == input.shape[2]
        # self.output = numpy.zeros((self.layer_size), dtype=FLOAT)
        self.output = numpy.dot(input.flatten(), self.W)
        self.output = nonlinear_func(self.output + self.b)
        self.input = input
        self.train_or_not = train_or_not
        # dropout
        # NOTE(review): `dcnn_inner` is only bound when the Cython import at
        # module load succeeded; on the fallback path this may raise NameError
        # unless the replacements module also provides it — verify.
        if train_or_not:
            self.dropout = dcnn_inner.dropout_sample_1d(self.layer_size, self.dropout_rate)
            self.output[self.dropout] = 0.
        return self.output

    def backward(self, delta):
        """Accumulate W/b gradients; return the delta reshaped to the input
        tensor's layout for the layer below."""
        # dropout code
        # start
        if self.train_or_not:
            delta[self.dropout] = 0.
        # end
        self.back_grad = numpy.dot(delta, self.W.T)
        if not self.back_linear:
            # x*(1-x): assumes `input` holds sigmoid activations — TODO confirm.
            self.back_grad = self.back_grad * self.input.flatten() * (1 - self.input.flatten())
        self.back_grad = self.back_grad.reshape((self.n_feature_maps, self.n_feature_rows, self.n_feature_cols))
        #grad_W = numpy.outer(self.input.flatten(), delta).reshape(self.W.shape) + self.decay * self.W
        grad_W = numpy.outer(self.input.flatten(), delta).reshape(self.W.shape)
        grad_b = delta
        self.grads[0] += grad_W
        self.grads[1] += grad_b
        return self.back_grad

    def update(self, alpha):
        """One SGD step with the accumulated gradients, then reset them."""
        self.W -= alpha * self.grads[0]
        self.b -= alpha * self.grads[1]
        self.params = [self.W, self.b]
        self.reset_grads()

    def reset_grads(self):
        self.grads = [numpy.zeros(self.W.shape, dtype=FLOAT), numpy.zeros(self.b.shape, dtype=FLOAT)]


class OutputLayer(Layer):
    """Softmax output layer.

    Projects the flattened input through W and b, computes class
    probabilities with softmax, and — when a target label is supplied to
    forward() — caches the cross-entropy gradient for backward().
    """

    def __init__(self, rng, input_layer_size, layer_size, decay, back_linear=False):
        self.rng = rng
        self.input_layer_size = input_layer_size
        self.layer_size = layer_size
        self.decay = decay
        self.back_linear = back_linear
        # Only the non-linear backward path is implemented below.
        assert back_linear == False
        W_bound = 4 * numpy.sqrt(6. / (self.input_layer_size + self.layer_size))
        self.W = numpy.asarray(rng.uniform(low=-W_bound, high=W_bound, \
                                           size=(self.input_layer_size, self.layer_size)), dtype=FLOAT)
        self.b = numpy.zeros((self.layer_size,), dtype=FLOAT)
        self.params = [self.W, self.b]
        self.reset_grads()
        self.negative_log_likelihood = 0.

    def forward(self, input, sentence_len=None, y=None, train_or_not=False):
        """Return softmax class probabilities for `input`.

        When the target label `y` is given, also caches the negative log
        likelihood and the output delta (`self.grad`) for backward().
        """
        # input should be with shape of (full_con_layer_size, )
        if input.ndim > 1:
            input = input.flatten()
        assert self.input_layer_size == input.shape[0]
        self.output = softmax(numpy.dot(input, self.W) + self.b)
        self.input = input
        # BUGFIX: use an identity check — `y != None` misbehaves for NumPy
        # scalars/arrays.
        if y is not None:
            y_target = numpy.zeros((self.layer_size,), dtype=INT)
            y_target[y] = 1
            self.negative_log_likelihood = -numpy.log(self.output[y])
            self.grad = self.output - y_target
        else:
            self.negative_log_likelihood = None
            self.grad = None
        return self.output

    def backward(self, delta=None):
        """Accumulate W/b gradients; return the delta for the layer below.

        `delta` defaults to the gradient cached by forward(y=...).
        """
        # BUGFIX: `delta == None` is an elementwise comparison when delta is
        # an ndarray (ambiguous-truth ValueError under modern NumPy); an
        # identity check is the correct test.
        if delta is None:
            delta = self.grad
        # x*(1-x): assumes `input` holds sigmoid activations from the layer
        # below (back_linear is asserted False in __init__).
        self.back_grad = numpy.dot(delta, self.W.T) * self.input * (1 - self.input)
        grad_W, grad_b = numpy.outer(self.input, delta), delta
        self.grads[0] += grad_W
        self.grads[1] += grad_b
        return self.back_grad

    def update(self, alpha):
        """One SGD step with the accumulated gradients, then reset them."""
        self.W -= alpha * self.grads[0]
        self.b -= alpha * self.grads[1]
        self.params = [self.W, self.b]
        self.reset_grads()

    def reset_grads(self):
        self.grads = [numpy.zeros(self.W.shape, dtype=FLOAT), numpy.zeros(self.b.shape, dtype=FLOAT)]


class DCNNBase(object):
    """Dynamic Convolutional Neural Network built from a stack of `Layer`s.

    Provides the forward/backward pass over the whole stack, a single-worker
    SGD training loop with early stopping on validation entropy, simple
    evaluation helpers, and joblib (de)serialisation.

    `layers[0]` must expose `vocab`, `wordvec_dim` and
    `len_of_sentence_limit` (an InputLayer); `layers[-1]` must expose
    `layer_size` (an output layer).
    """

    def __init__(self, layers, alpha=0.025, min_alpha=0.00001, alpha_m=0.99999, entropy_descent_m=0.995,
                 decay=1e-4, workers=1, model_file="../model/dcnn_model.pkl"):
        self.layers = layers
        self.vocab = layers[0].vocab
        self.wordvec_dim = layers[0].wordvec_dim
        self.len_of_sentence_limit = layers[0].len_of_sentence_limit
        self.num_of_layers = len(layers)
        self.output_layer_size = layers[-1].layer_size
        self.alpha = alpha                          # current learning rate
        self.min_alpha = min_alpha                  # training stops once alpha decays below this
        self.alpha_m = alpha_m                      # per-update multiplicative alpha decay
        self.entropy_descent_m = entropy_descent_m  # required relative validation improvement
        self.decay = decay
        self.workers = workers                      # >1 is not implemented (see train())
        self.model_file = model_file

    def forward(self, sentence, train_or_not=False):
        """Run a tokenised `sentence` through every layer.

        Returns the raw output of the last layer, or None when no token of
        the sentence is in the vocabulary. A clipped copy (kept away from
        0/1 so subsequent log() calls stay finite) is cached in
        `self.output`.
        """
        indexes = sentence2indexes(sentence, self.vocab, self.len_of_sentence_limit)
        if len(indexes) == 0:
            return None
        output = self.layers[0].forward(indexes, len(indexes), train_or_not=train_or_not)
        for layer in self.layers[1:]:
            output = layer.forward(output, len(indexes), train_or_not=train_or_not)
        self.output = numpy.clip(output, 0.00000001, 0.99999999)
        return output

    def get_delta_and_negative_log_likelihood(self, y):
        """Return (output delta, clipped negative log likelihood) for the
        true label `y`. Must be called after forward() set `self.output`."""
        y_target = numpy.zeros((self.output_layer_size,), dtype=FLOAT)
        y_target[y] = 1.0
        self.negative_ll = numpy.clip(-math.log(self.output[y]), 0.00000001, 10.)
        # BUGFIX: the original check was `self.negative_ll == numpy.nan`,
        # which is always False (NaN never compares equal to anything);
        # NaN must be detected with isnan().
        if numpy.isnan(self.negative_ll):
            logger.error("negative log likelihood is NaN, output=%s" % self.output)
        self.delta = self.output - y_target
        return self.delta, self.negative_ll

    def get_loss(self, y):
        """Negative log likelihood of label `y` for the last forward() pass."""
        return self.get_delta_and_negative_log_likelihood(y)[1]

    def backward(self, y):
        """Backpropagate the error for label `y` through all layers;
        returns the sample's negative log likelihood."""
        delta, neg_ll = self.get_delta_and_negative_log_likelihood(y)
        for layer in self.layers[::-1]:
            delta = layer.backward(delta)
        return neg_ll

    def update(self):
        """Apply every layer's accumulated gradients with the current alpha."""
        for layer in self.layers[::-1]:
            layer.update(self.alpha)

    def accuracy(self, sentences, labels):
        """Fraction of `sentences` whose argmax prediction equals its label.

        NOTE(review): raises on empty input (`i` stays unbound), and
        forward() may return None for out-of-vocabulary sentences.
        """
        acc = 0.0
        for i, (sentence, y) in enumerate(izip(sentences, labels)):
            output = np.argmax(self.forward(sentence)) == y
            if output:
                acc += 1
        return acc * 1.0 / (i + 1)

    def negative_log_likelihood(self, sentences, labels):
        """Mean negative log likelihood over a labelled corpus."""
        # BUGFIX: identity check instead of `!= None`.
        assert self.vocab is not None
        entropy = 0.0
        for i, (sentence, y) in enumerate(izip(sentences, labels)):
            self.forward(sentence)
            entropy += self.get_delta_and_negative_log_likelihood(y)[1]
        return entropy / (i + 1)

    def get_params(self):
        """Flat list of every layer's parameter arrays (references, not copies)."""
        params = []
        for layer in self.layers:
            params += layer.params
        return params

    def set_params(self, params):
        """Assign a flat parameter list back onto the layers, in order."""
        num_of_params = 0
        for layer in self.layers:
            num_of_params += len(layer.params)
        assert num_of_params == len(params)
        j = 0
        for layer in self.layers:
            l = len(layer.params)
            for i in xrange(l):
                layer.params[i] = params[j]
                j += 1

    def get_grads(self):
        """Flat list of every layer's gradient accumulators."""
        grads = []
        for layer in self.layers:
            grads += layer.grads
        return grads

    def set_grads(self, grads):
        """Assign a flat gradient list back onto the layers, in order."""
        num_of_grads = 0
        for layer in self.layers:
            num_of_grads += len(layer.grads)
        assert num_of_grads == len(grads)
        j = 0
        for layer in self.layers:
            l = len(layer.grads)
            for i in xrange(l):
                layer.grads[i] = grads[j]
                j += 1

    def reset_grads(self):
        """Clear every layer's gradient accumulators."""
        for layer in self.layers:
            layer.reset_grads()

    def get_layers(self):
        return self.layers

    def set_layers(self, layers):
        self.layers = layers

    def save(self):
        """Persist the whole model to `self.model_file` with joblib."""
        logger.info("saving model...")
        joblib.dump(self, self.model_file)

    @classmethod
    def load(cls, model_file):
        """Load a model previously written by save()."""
        return joblib.load(model_file)

    def train(self, train_sentences, train_labels, validate_sentences, validate_labels,
              validate_freq=1000, chunksize=10, patience=5, verbose_freq=200, max_entropy_allowed=0.20):
        """Single-worker mini-batch SGD with early stopping.

        Parameters are updated every `chunksize` sentences; every
        `validate_freq` updates the validation entropy is measured. Training
        stops when validation entropy fails to improve by a factor of
        `entropy_descent_m` for `patience` consecutive checks (unless the
        running training entropy is still above `max_entropy_allowed`), or
        when alpha decays below `min_alpha`. The best parameters seen are
        restored and the model is saved.

        NOTE(review): if training stops before the first validation check,
        `best_params[0]` is still None and set_params() will fail.
        """
        # Single-element lists are leftover mutable cells from the removed
        # multi-threaded implementation; kept so the loop body reads the same.
        entropy_total = [0.0, ]
        sentence_count = [0, ]
        best_validate_entropy = [999.0, ]
        continue_or_not = [True, ]
        logger.info("training with %d workers" % self.workers)
        not_passed = [0, ]
        best_params = [None, ]
        update_times = [0., ]
        loops = 0

        if self.workers > 1:
            # The multi-threaded prototype was removed (it was buggy and never
            # enabled); only single-worker training is supported.
            raise NotImplementedError
        else:
            while True:
                if not continue_or_not[0]:
                    break
                loops += 1
                tic = time.time()
                sentence_count[0] = 0
                entropy_total[0] = 0.
                for sentence, y in izip(train_sentences, train_labels):
                    sentence_count[0] += 1
                    self.forward(sentence, train_or_not=True)
                    entropy_total[0] += self.backward(y)
                    if sentence_count[0] % chunksize == 0:
                        self.update()
                        self.alpha *= self.alpha_m
                        update_times[0] += 1
                        if update_times[0] % validate_freq == 0:
                            validate_entropy = self.negative_log_likelihood(validate_sentences, validate_labels)
                            # Not improved enough: burn one unit of patience.
                            if validate_entropy > best_validate_entropy[0] * self.entropy_descent_m:
                                if not_passed[0] >= patience:
                                    continue_or_not[0] = False
                                else:
                                    not_passed[0] += 1
                            else:
                                not_passed[0] = 0
                            # Keep training while the running train entropy is
                            # clearly too high, regardless of patience.
                            if entropy_total[0] / sentence_count[0] > max_entropy_allowed:
                                not_passed[0] = 0
                                continue_or_not[0] = True
                            if validate_entropy < best_validate_entropy[0]:
                                best_validate_entropy[0] = validate_entropy
                                best_params[0] = copy.deepcopy(self.get_params())
                            logger.info("validate entropy:%f, best validate entropy:%f"
                                %(validate_entropy, best_validate_entropy[0]))
                            if self.alpha < self.min_alpha:
                                continue_or_not[0] = False
                    if sentence_count[0] % verbose_freq == 0:
                        logger.info("loops:%d, trained sentences:%d, speed:%0.2f sentences/s, alpha:%0.6f, entropy:%0.6f" \
                                        %(loops, sentence_count[0], sentence_count[0]/(time.time()-tic),
                                          self.alpha, entropy_total[0]/sentence_count[0]))
                    if not continue_or_not[0]:
                        break

        self.set_params(best_params[0])
        acc = self.accuracy(validate_sentences, validate_labels)
        logger.info("validate accuracy: %f" % acc)
        self.save()


class DCNNDeep(DCNNBase):
    """Concrete deep DCNN: embedding -> (wide convolution -> folding ->
    dynamic k-max pooling) x len(n_filters) -> fully connected -> softmax.

    Word vectors are either randomly initialised, or pre-trained with
    word2vec on `pre_train_sentences` when `pre_train_word_vec` is True.
    """
    def __init__(self, wordvec_dim=48, output_layer_size=2, sentences=None, vocab=None, filter_width=[7, 5], full_con_layer_size=10,
                 n_filters=[2, 2], k_top=4, alpha=0.025, min_alpha=0.00001, alpha_m=0.99999, dropout_rate_in_hiddens=0.5,
                 dropout_rate_in_input=0.2, entropy_descent_m=0.995, seed=1, len_of_sentence_limit=500, decay=1e-4,
                 min_count=2, workers=1, pre_train_word_vec=False, pre_train_sentences=None, model_file="../model/dcnn_model.pkl"):
        self.wordvec_dim = wordvec_dim
        self.output_layer_size = output_layer_size
        self.sentences = sentences
        # Every Folding stage halves the embedding rows, so wordvec_dim must
        # be divisible by 2**len(filter_width).
        # NOTE(review): the error message shows "//" but the check above is a
        # modulo test — the message is misleading.
        if wordvec_dim % (pow(2,len(filter_width))) != 0:
            raise ValueError("wordvec_dim // %d != 0" % (pow(2,len(filter_width))))
        word2vec = None
        if not pre_train_word_vec:
            # Build the vocabulary from `sentences`, or use a prebuilt one.
            if sentences != None:
                self.vocab, self.index2word = build_vocab(sentences, min_count=min_count)
            if sentences == None:
                assert vocab != None
                self.vocab = vocab
        else:
            if pre_train_sentences != None:
                logger.info("training word2vec using unsupervised method...")
                word2vec = Word2Vec(sentences=pre_train_sentences, size=self.wordvec_dim, workers=workers, min_count=min_count)
            else:
                raise ValueError
            self.vocab = word2vec.vocab
            self.index2word = word2vec.index2word
        self.filter_width = filter_width
        self.full_con_layer_size = full_con_layer_size
        self.n_filters = n_filters
        self.k_top = k_top
        self.seed = seed
        self.len_of_sentence_limit = len_of_sentence_limit
        self.min_count = min_count
        self.rng = np.random.RandomState(seed)
        layers = []
        input_layer = InputLayer(self.rng, self.vocab, self.wordvec_dim,
                                 self.len_of_sentence_limit, decay=decay,
                                 dropout_rate=dropout_rate_in_input)
        if word2vec != None:
            # Seed the embedding matrix with the pre-trained word vectors.
            for i, word in enumerate(self.index2word):
                input_layer.W[:, i] = word2vec[word]
        layers.append(input_layer)
        assert len(filter_width) == len(n_filters)
        assert len(n_filters) > 1
        # One (wide convolution -> folding -> k-max pooling) stage per entry
        # in n_filters; the first stage convolves the single embedding map
        # and uses a linear backward pass into the embedding layer.
        for i in xrange(len(n_filters)):
            convolve_n_feature_maps = 1 if i == 0 else n_filters[i-1]
            convolve_back_linear = True if i == 0 else False
            layers.append(WideConvolutionLayer(self.rng, filter_width[i],
                                               n_filters[i], convolve_n_feature_maps,
                                               back_linear=convolve_back_linear, decay=decay))
            layers.append(Folding(n_filters[i], back_linear=True))
            layers.append(DynamicKMaxPooling(n_filters[i], wordvec_dim/(pow(2,i+1)),
                                             len(n_filters), i+1, k_top,
                                             back_linear=True, decay=decay))
        # NOTE(review): `i` is reused here after the loop ends — relies on
        # len(n_filters) >= 1 (guaranteed by the assert above).
        layers.append(FullConnectedLayer(self.rng, n_filters[-1], wordvec_dim/(pow(2,i+1)), k_top,
                                         self.full_con_layer_size, decay=decay, back_linear=False,
                                         dropout_rate=dropout_rate_in_hiddens))
        layers.append(OutputLayer(self.rng, self.full_con_layer_size, output_layer_size, decay=decay))
        super(DCNNDeep, self).__init__(layers, alpha=alpha, min_alpha=min_alpha, alpha_m=alpha_m,
                                           entropy_descent_m=entropy_descent_m, decay=decay, workers=workers,
                                           model_file=model_file)